
********************************************************************************
*
* This program takes the set of strongly connected establishments and prepares
* files to send to Matlab to estimate the Sorkin (QJE 2018) model.
*
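* There are two output files, both written to the data_new folder under $matlab
* and suffixed with the version macro $ver:
* connected_moves_ver$ver.csv
* params_ver$ver.csv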
*
********************************************************************************

*-------------------------------------------------------------------------------
* Full dataset
*-------------------------------------------------------------------------------

*Add indicator of belonging to largest strongly connected to the estab matrix id file
* Input : sconn_idfy_ver$ver.csv, whose two columns are read as v1/v2 and renamed
*         to matrix_estabid and in_strconn
* Output: $temp/estab_matrix_ids_ver$ver.dta with in_strconn and model_estabid added

* forward slashes work on every platform Stata runs on (backslashes are Windows-only)
import delimited using "$matlab/sconn_idfy/sconn_idfy_ver$ver.csv", clear
rename v1 matrix_estabid
rename v2 in_strconn
merge 1:1 matrix_estabid using "$temp/estab_matrix_ids_orig_ver$ver.dta"
//few obs are lost due to the restrictions imposed in str_conn_input
* anything that did not match on both sides is treated as outside the set
replace in_strconn=0 if _merge!=3
drop _merge
* rename only if a variable called "a" exists; capture keeps the run going otherwise
cap rename a betnr
* strongly connected establishments first, so they receive the lowest model ids
gsort -in_strconn matrix_estabid
* long instead of the default float: float cannot hold every integer above 16,777,216
gen long model_estabid = _n
save "$temp/estab_matrix_ids_ver$ver.dta", replace

*Calculate f_o for the strongly connected set
* f_o = (moves with currentid==1 received by the establishment) /
*       (all such moves received inside the strongly connected set)
* Output: $temp/fo_new_ver$ver with one row per betnr (variables: betnr f_o)
use "$temp/str_conn_input_ver$ver", clear
* keep only moves whose sender is id 1 (note: this also drops missing currentid)
drop if currentid != 1
gen double betnr = toid
merge m:1 betnr using "$temp/estab_matrix_ids_ver$ver", keepusing(in_strconn)
//the only element in strconn not merged is 1=N which doesnt hire from N
tab _merge in_strconn
keep if in_strconn==1
keep if _merge==3
drop _merge
gen byte flow = 1
* counts are stored as double: egen's default float rounds integers above 16,777,216,
* which would distort f_o in large samples
bysort betnr: egen double M_in = total(flow)
egen double sum_nj = total(flow)
gen double f_o = M_in/sum_nj
* f_o is constant within betnr, so keeping one row per betnr loses nothing
duplicates drop betnr, force
keep betnr f_o
count
save "$temp/fo_new_ver$ver", replace


* Reload the trimmed movers file, flag for every move whether its sender and its
* receiver are in the strongly connected set, keep the moves inside the set and
* attach the offer distribution f_o.
* Output: $temp/model_input_new_ver$ver
use "$temp/str_conn_input_ver$ver", clear

* --- Sender side -------------------------------------------------------------
* Look the sender up in the establishment id file. Only matched rows survive:
* moves whose sender is not in the id file and id-file establishments that never
* appear as a sender are both dropped.
rename currentid estabid
merge m:1 estabid using "$temp/estab_matrix_ids_ver$ver", ///
    keepusing(matrix_estabid model_estabid in_strconn) keep(match) nogenerate
rename (matrix_estabid model_estabid estabid in_strconn) ///
    (matrix_currentid model_currentid currentid sender_insconn)

* --- Receiver side -----------------------------------------------------------
* Same lookup for the receiver, but here unmatched rows are kept.
rename toid estabid
merge m:1 estabid using "$temp/estab_matrix_ids_ver$ver", ///
    keepusing(matrix_estabid model_estabid in_strconn)
* Rows that exist only in the id file have no sender information; their sender
* fields are all set to 1.
foreach v of varlist currentid matrix_currentid model_currentid sender_insconn {
    replace `v' = 1 if _merge == 2
}
drop _merge
rename (estabid matrix_estabid model_estabid in_strconn) ///
    (toid matrix_toid model_toid recvr_insconn)

* --- Restrict to the strongly connected set ----------------------------------
gen strongly_connected = (sender_insconn == 1 & recvr_insconn == 1)

* Diagnostics for the log: sender and receiver model ids should span the same
* range if the move matrix is square.
count if strongly_connected
sum model_currentid if strongly_connected, detail
sum model_toid if strongly_connected, detail

tab strongly_connected
keep if strongly_connected == 1

* --- Offer distribution (f_o) ------------------------------------------------
rename currentid betnr
merge m:1 betnr using "$temp/fo_new_ver$ver"
* The only id allowed to lack a match is 1.
assert betnr == 1 if _merge != 3
tab _merge
rename betnr currentid
drop _merge

compress
save "$temp/model_input_new_ver$ver", replace


*Export moves to csv for model estimation in matlab
* This part attaches EE/EN rates and establishment-year worker counts to each
* move, computes the relative establishment size g and the total W, and reduces
* the data to the variables used further down (i j g eerate_avgall enrate_avgall
* f_o bin year).

* EE/EN rates and bin by establishment, year and q; unmatched rate rows are dropped
rename currentid betnr
merge m:1 betnr year q using "$results/estab_eeenrates_byyr_ver$ver", keepusing(eerate_avgall enrate_avgall bin)
drop if _merge == 2
drop _merge

* worker counts by establishment and year
* (forward slashes work on every platform; backslashes are Windows-only)
rename betnr currentid
merge m:1 currentid year using "$temp/workers_currentid_year_ver$ver"
drop if _merge == 2
drop _merge

rename currentid betnr

* tag one row per establishment-year so each year's worker count is summed once
bysort betnr year: gen byte tag = (_n == 1)
bysort betnr: egen double tot_empl = total(workers) if tag == 1
egen double W = total(tag * workers) // total number of workers ever employed in the str connected set
gen double g_temp = tot_empl/W
* g_temp is only filled on tagged rows; the mean spreads it to every row of the
* establishment. Stored as double like its inputs (the default float would round it).
bysort betnr: egen double g = mean(g_temp) // relative size of establishment
drop g_temp tag workers

rename betnr currentid

* W is constant across rows, so its mean is W; the scalar is written to the
* parameter file later in this do-file
sum W
scalar W = r(mean)

* i = receiving id, j = sending id
gen double i = toid
gen double j = currentid
drop currentid toid
keep i j g eerate_avgall enrate_avgall f_o bin year
*Replace bin equal to some positive number so that all NE transitions are endog
assert bin==. if j==1
replace bin=0.5 if bin==.
order i j eerate_avgall enrate_avgall g f_o bin

* Check that the set of receiving ids (i) and the set of sending ids (j) are the
* same: the distinct i values are saved as a lookup keyed on j, and every row
* must match in both directions.
preserve
keep i
duplicates drop
rename i j
save "$temp/test01_hs", replace
restore
merge m:1 j using "$temp/test01_hs", assert(match) nogenerate
erase "$temp/test01_hs.dta"

* Recode id 1 to 2 on both sides of the move
foreach v of varlist i j {
    replace `v' = 2 if `v' == 1
}

* Every row whose sender is not the recoded id 2 must carry rates and f_o
foreach v of varlist eerate_avgall enrate_avgall f_o {
    assert !missing(`v') if j != 2
}

* add industry codes (to calculate delta and rho by sector)
* The sender id j is matched to $temp/wz.dta; rows of wz.dta without a move are dropped.
rename j betnr
merge m:1 betnr using "$temp/wz.dta", keepusing(ind sector)
drop if _merge == 2
drop _merge
rename betnr j

* consecutive sector numbers 1..max_ind, one per distinct value of ind
sort ind
egen sector_num = group(ind)
su sector_num
* if no row has an industry code r(max) is missing; fall back to 0 so the two
* catch-all codes below become 1 and 2 instead of missing
global max_ind = cond(missing(r(max)), 0, r(max))
replace sector_num = $max_ind + 1 if missing(ind) & j != 2 // ee- or en-flows with missing industry
replace sector_num = $max_ind + 2 if missing(ind) & j == 2 // ne-flows with missing industry

* the merged-in sector variable is replaced by the numbering built above
drop sector ind year
rename sector_num sector

* forward slashes work on every platform; backslashes are Windows-only
export delimited using "$matlab/data_new/connected_moves_ver$ver.csv", datafmt replace


*Create vector of parameters for model estimation in matlab
* Rows of params_ver$ver.csv: (1) W, (2) initial endogenous EN rate,
* (3) initial endogenous EE rate
*EE-EN function
use "$results/estab_eeenrates_byyr_ver$ver", clear

* one row per bin (the first row encountered for each bin is kept)
duplicates drop bin, force
*Initial values of endogenous EN and EE rates (uses all periods all firms)
* Stata treats missing as larger than any number, so "bin>0" alone would also
* pick up rows with missing bin; they are excluded explicitly
sum enrate_avgall if bin>0 & !missing(bin)
scalar endog_enrate = r(mean)
sum eerate_avgall if bin>0 & !missing(bin)
scalar endog_eerate = r(mean)

clear
matrix params = J(3,1,.)
* scalar() makes explicit that W is the scalar stored earlier in this do-file
matrix params[1,1] = scalar(W)
matrix params[2,1] = endog_enrate
matrix params[3,1] = endog_eerate
* double: svmat's default float cannot hold every integer above 16,777,216
svmat double params
rename params1 params

* export uses the display format (datafmt); %18.0g writes as many digits as fit
* instead of capping the output at 7 digits
format params %18.0g


* forward slashes work on every platform; backslashes are Windows-only
export delimited using "$matlab/data_new/params_ver$ver.csv", datafmt replace
